#R libraries

library(knitr)
library(yaml)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
require(graphics)
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(stringr)
library(stringi)
library(mapproj)
library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(readr)
library(rio)
## 
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
## 
##     export
library(naniar)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(grid)
library(mice)
## Warning: package 'mice' was built under R version 4.0.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:RCurl':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(datasets)

#Beer and Breweries data input from CSV files

# This reads in the Beers data from select folder file Beers.csv.
Beerdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Beers.csv",
                     header = T,sep = ",",na.strings = "NA",fill = TRUE)
head(Beerdata)
##                  Name Beer_ID   ABV IBU Brewery_id
## 1            Pub Beer    1436 0.050  NA        409
## 2         Devil's Cup    2265 0.066  NA        178
## 3 Rise of the Phoenix    2264 0.071  NA        178
## 4            Sinister    2263 0.090  NA        178
## 5       Sex and Candy    2262 0.075  NA        178
## 6        Black Exodus    2261 0.077  NA        178
##                            Style Ounces
## 1            American Pale Lager     12
## 2        American Pale Ale (APA)     12
## 3                   American IPA     12
## 4 American Double / Imperial IPA     12
## 5                   American IPA     12
## 6                  Oatmeal Stout     12

#Number of rows in Beer data

nrow(Beerdata)
## [1] 2410
# This reads in the Beers data from select folder file Breweries.csv.
Breweriesdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Breweries.csv",
                          header = T,sep = ",",na.strings = "NA", fill = TRUE)
head(Breweriesdata)
##   Brew_ID                      Name          City State
## 1       1        NorthGate Brewing    Minneapolis    MN
## 2       2 Against the Grain Brewery    Louisville    KY
## 3       3  Jack's Abby Craft Lagers    Framingham    MA
## 4       4 Mike Hess Brewing Company     San Diego    CA
## 5       5   Fort Point Beer Company San Francisco    CA
## 6       6     COAST Brewing Company    Charleston    SC

#Number of rows in Breweries data

nrow(Breweriesdata)
## [1] 558

#Generating geographic plots from breweries data (following chunk generate the final plot)

#makes a data frame with State name and abbreviation.
lookup = data.frame(abb = state.abb, State = state.name)
dc <- c("DC", "District of Columbia")
lookup <- rbind(lookup,dc)
# Change Column Name State to abb (abbreviation)
colnames(Breweriesdata)[4] = "abb" 

# Removes left space in state abb data taken from Breweries CSV
Breweriesdata$abb <- trimws(Breweriesdata$abb,which = c("left"))  

# make one dataset with state names and abb
Brewdata <- merge(Breweriesdata,lookup,"abb") 

Brewcount <- count(Brewdata,State,abb) #count up the occurrence of each state

colnames(Brewcount)[3] = "Breweries_Count" #change "n" to "Breweries_Count"

# added state name also changed to lower case
Brewcount$region <- tolower(Brewcount$State) 

Brewcount2 = Brewcount[-1] #removed first column from Brew count data state name 

states <- map_data("state") #contains data for states excluding Alaska, Hawaii

#Added Alaska to states as the states data does not include Alaska (Hawaii not in data so did not include in below data)
Alaska <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Alaska.csv",header = T,sep = ",",na.strings = "NA",
                          fill = TRUE);
states <- rbind(states,Alaska)

#map.df is data frame containing longitude and latitude to form state and breweries count by state
map.df <- merge(states,Brewcount2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]

#Breweries count by state using gradient graphics
plot <- map.df %>% ggplot(aes(x=long, y=lat, group = group)) +
  geom_polygon(aes(fill = Breweries_Count)) +  geom_path() + 
  scale_fill_gradientn(colours=rev(topo.colors(10)),na.value="grey90",
                       breaks=c(0,5,10,15,20,25,30,35,40,45,50))+
  ggtitle("Breweries Count by State") + coord_map()

#state center longitude and latitude (read data from csv file)
st_center <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/long_lat_statecenter.csv",header = T,sep = ",",na.strings = "NA",
                          fill = TRUE);
# state names changed to lower case
st_center$region <- tolower(st_center$region) 

#map.df1 merges state center with Breweries count data
map.df1 <- merge(st_center,Brewcount2, by="region", all.x = T)

# Adding state names and Breweries count data by state 
Nplot <- plot + geom_point(data=map.df1, aes(x=long, y=lat, 
                                             size = Breweries_Count, group = region),
                           show.legend = FALSE) + 
  geom_text(aes( x= long, y = lat,label = abb.x, group = region), 
            data = map.df1, vjust=-1.2) +
  geom_text(aes(x = long, y = lat, label = Breweries_Count, group = region), 
            data = map.df1,vjust=1.8)


#Code below adds scaled legend to the Breweries count by state
cplot <- Nplot + theme(legend.background = element_rect(fill="gray90", 
                                                        size=10, linetype="dotted"),
                       legend.key.height = unit(4,"lines")) + 
  theme(plot.title = element_text(size=14, face= "bold", colour= "black"))

#Final plot with state names and Breweries count by state

cplot  #plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

The final plot shows that ‘Colorado’ has the highest (47)number of breweries followed by ‘California (39)’, ‘Michigan(32)’ and ‘Oregon(29)’ being the next three successors.

#Merging beer data and breweries data

colnames(Beerdata)[5] = "Brew_ID" #Changing columun name to combine both beer data and breweries data
Totaldata <- merge(Beerdata,Breweriesdata, by="Brew_ID", all.x = T) %>% na_if("")
colnames(Totaldata)[2] = "Beer_Name" #Changing the column name to Beer name
colnames(Totaldata)[8] = "Brewery_Name" # Changing the column name to Brewery name
colnames(Totaldata)[10] = "State" # Changing the column name back to State

#First 6 observations after merging beer data and breweries data

head(Totaldata,6)
##   Brew_ID     Beer_Name Beer_ID   ABV IBU                               Style
## 1       1  Get Together    2692 0.045  50                        American IPA
## 2       1 Maggie's Leap    2691 0.049  26                  Milk / Sweet Stout
## 3       1    Wall's End    2690 0.048  19                   English Brown Ale
## 4       1       Pumpion    2689 0.060  38                         Pumpkin Ale
## 5       1    Stronghold    2688 0.060  25                     American Porter
## 6       1   Parapet ESB    2687 0.056  47 Extra Special / Strong Bitter (ESB)
##   Ounces       Brewery_Name        City State
## 1     16 NorthGate Brewing  Minneapolis    MN
## 2     16 NorthGate Brewing  Minneapolis    MN
## 3     16 NorthGate Brewing  Minneapolis    MN
## 4     16 NorthGate Brewing  Minneapolis    MN
## 5     16 NorthGate Brewing  Minneapolis    MN
## 6     16 NorthGate Brewing  Minneapolis    MN

#Last 6 observations after merging beer data and breweries data

tail(Totaldata,6)
##      Brew_ID                 Beer_Name Beer_ID   ABV IBU
## 2405     556             Pilsner Ukiah      98 0.055  NA
## 2406     557  Heinnieweisse Weissebier      52 0.049  NA
## 2407     557           Snapperhead IPA      51 0.068  NA
## 2408     557         Moo Thunder Stout      50 0.049  NA
## 2409     557         Porkslap Pale Ale      49 0.043  NA
## 2410     558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                  Brewery_Name          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

#Missing Values in ABV/IBU columns

table(is.na(Totaldata$ABV)) # True value represents NA values in the column ABV in original data
## 
## FALSE  TRUE 
##  2348    62
table(is.na(Totaldata$IBU)) # True value represents NA values in the column IBU in original data
## 
## FALSE  TRUE 
##  1405  1005
table(is.na(Totaldata$Style)) # True value represents NA values in the column Style in original data
## 
## FALSE  TRUE 
##  2405     5
gg_miss_var(Totaldata)+ ggtitle("Missing data values by Category")+scale_y_log10() + theme(
# AXIS LABLES APPEARANCE
plot.title = element_text(size=14, face= "bold", colour= "black" ),
axis.title.x = element_text(size=12, face="bold", colour = "black"),    
axis.title.y = element_text(size=12, face="bold", colour = "black"),    
axis.text.x = element_text(size=12, face="bold", colour = "black"), 
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

The plot and the values in tables above provide the missing data values in ABV (65), IBU (1005) and Style (5) columns from originally considered beer data.

#Addressing the missing values

#MICE - Multivariate Imputation by Chained Equations method is used to impute the missing variables in ABV and IBU columns
imp <- mice(Totaldata, method='norm.predict', m=5)
## 
##  iter imp variable
##   1   1  ABV  IBU
##   1   2  ABV  IBU
##   1   3  ABV  IBU
##   1   4  ABV  IBU
##   1   5  ABV  IBU
##   2   1  ABV  IBU
##   2   2  ABV  IBU
##   2   3  ABV  IBU
##   2   4  ABV  IBU
##   2   5  ABV  IBU
##   3   1  ABV  IBU
##   3   2  ABV  IBU
##   3   3  ABV  IBU
##   3   4  ABV  IBU
##   3   5  ABV  IBU
##   4   1  ABV  IBU
##   4   2  ABV  IBU
##   4   3  ABV  IBU
##   4   4  ABV  IBU
##   4   5  ABV  IBU
##   5   1  ABV  IBU
##   5   2  ABV  IBU
##   5   3  ABV  IBU
##   5   4  ABV  IBU
##   5   5  ABV  IBU
## Warning: Number of logged events: 5
Totaldata_imp=complete(imp)
gg_miss_var(Totaldata_imp) # shows only style as missing data

table(is.na(Totaldata_imp$Style)) # True value represents NA values in the column Style in original data
## 
## FALSE  TRUE 
##  2405     5
Totaldata_imp[c(which(is.na(Totaldata_imp$Style))),]
##     Brew_ID                      Beer_Name Beer_ID        ABV      IBU Style
## 227      30                Special Release    2210 0.06286884 46.13160  <NA>
## 455      67                  OktoberFiesta    2527 0.05300000 27.00000  <NA>
## 946     161 Kilt Lifter Scottish-Style Ale    1635 0.06000000 21.00000  <NA>
## 992     167                 The CROWLERâ„¢    1796 0.07755227 62.09427  <NA>
## 993     167           CAN'D AID Foundation    1790 0.05870227 41.53910  <NA>
##     Ounces               Brewery_Name         City State
## 227     16        Cedar Creek Brewery Seven Points    TX
## 455     12   Freetail Brewing Company  San Antonio    TX
## 946     12 Four Peaks Brewing Company        Tempe    AZ
## 992     32        Oskar Blues Brewery     Longmont    CO
## 993     12        Oskar Blues Brewery     Longmont    CO
#Internet search for all the missing styles above
Totaldata_imp[227,"Style"] = "English India Pale Ale (IPA)" #"https://irp-cdn.multiscreensite.com/b5112d09/files/uploaded/Beer%20Menu_2up%20%281%29-1.png"

Totaldata_imp[455,"Style"] = "Vienna Lager"  #https://www.craftbeer.com/styles/german-style-marzen-oktoberfest#:~:text=A%20beer%20rich%20in%20malt,similar%20to%20the%20Vienna%20lager.&text=Originating%20in%20Germany%2C%20this%20style,or%20lagered%2C%20throughout%20the%20summer.

Totaldata_imp[946,"Style"] = "Scotch Ale / Wee Heavy"  #based on name

Totaldata_imp[992,"Style"] = "German Pilsener" #https://www.oskarblues.com/beers/

Totaldata_imp[993,"Style"] = "German Pilsener" #https://www.oskarblues.com/beers/

Generating bar plots for Beerdata categorized by number of Ounces used to generate ABV

Totaldata_imp$Ounces <- as.factor(Totaldata_imp$Ounces)

#Summary count of number of ABV values by categorized Ounces of beer
s <- Totaldata_imp %>% group_by(Ounces)%>% summarize(ABV,count=n())
## `summarise()` regrouping output by 'Ounces' (override with `.groups` argument)
# Bar plot of ABV v. Ounces to see how much data is available
Totaldata_imp %>% ggplot(aes(x = Ounces, y = s$count[1], fill=Ounces)) + geom_col() +
  ggtitle("Alcohol By Volume (ABV) data count by Ounce Category") +
  labs(y="Alcohol By Volume (ABV)") + 
  geom_text(aes(Ounces, s$count+50, label = s$count, fill = NULL), data = s)

#Median for ABV and IBU content for each state (final plot given in below chunk)

#Evaluating median of ABV and IBU (all NA values removed from data)
Median_ABV <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>% 
  summarize(Median_ABV = median(ABV, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_IBU <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>% 
  summarize(Median_IBU = median(IBU, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV <- data.frame(Median_ABV)
Median_IBU <- data.frame(Median_IBU)
Median_ABV$Cat <- "Median_ABV"
Median_IBU$Cat <- "Median_IBU"


#Bar plot for Median ABV/IBU content for each state
Medianplot_ABV_IBU <- Median_ABV %>% ggplot(mapping = aes(x,y)) + 
  geom_bar(aes(x = State, y = Median_ABV, fill = State), group = Median_ABV$State, 
                                                   stat = 'identity', show.legend = FALSE) +
  ggtitle('Median ABV Content by State') + geom_text(aes(State, Median_ABV+0.005, 
                                                      label = percent(Median_ABV, accuracy = 0.01),
                                                      fill = NULL), data = Median_ABV, angle = 90) + 
  geom_bar(data=Median_IBU, aes(x = State, y = Median_IBU, fill = State), stat = 'identity', 
           show.legend = FALSE) +
  geom_text(aes(State, Median_IBU+2, label = round(Median_IBU,digits = 0), fill = NULL), data = Median_IBU) +
  ggtitle('Median ABV and Median IBU Content by State') + facet_grid(Cat~., scale = "free_y",
                                                              labeller = label_parsed, switch = "y") + labs(x="State") + 
  theme(axis.title.y = element_blank(), strip.placement = "outside",axis.title.x = element_text(size=12, face="bold", colour = "black"),    
axis.text.x = element_text(size=12, face="bold", colour = "black"), 
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)

Median plot for ABV and IBU by state

Medianplot_ABV_IBU 

#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

From the median ABV and median IBU plot above it can be observed that ‘District of Columbia’ and ‘Kentucky’ have the highest median ABV values shown in percentage. ‘Maine’ and ‘West Virginia’ have the highest median IBU values.

Maximum ABV and IBU by State

#collecting max data
Max_ABV <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>% 
  summarize(Max_ABV = max(ABV, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_IBU <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>% 
  summarize(Max_IBU = max(IBU, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV$Cat <- "Max_ABV"
Max_IBU$Cat <- "Max_IBU"

#Bar plot for max ABV content for each state
Maxplot_ABV_IBU <- Max_ABV %>% ggplot(mapping = aes(x,y)) + 
  geom_bar(aes(x = State, y = Max_ABV, fill = State), group = Max_ABV$State, 
                                                   stat = 'identity', show.legend = FALSE) +
  ggtitle('Max ABV Content by State') + geom_text(aes(State, Max_ABV+0.0085, 
                                                      label = percent(Max_ABV, accuracy = 0.01),
                                                      fill = NULL), data = Max_ABV, angle = 90) + 
  geom_bar(data=Max_IBU, aes(x = State, y = Max_IBU, fill = State), stat = 'identity', 
           show.legend = FALSE) +
  geom_text(aes(State, Max_IBU+5, label = round(Max_IBU,digits = 0), fill = NULL), data = Max_IBU) +
  ggtitle('Max ABV and Max IBU Content by State') + facet_grid(Cat~., scale = "free_y",
                                                              labeller = label_parsed, switch = "y") + labs(x="State") + 
  theme(axis.title.y = element_blank(), strip.placement = "outside",axis.title.x = element_text(size=12, face="bold", colour = "black"),    
axis.text.x = element_text(size=12, face="bold", colour = "black"), 
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)

Maximum ABV and IBU vaule by state

Maxplot_ABV_IBU 

#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)

From the Max ABV and IBU plot it is observed that ‘Colorado’ has the max ABV value of 12.8% and ‘Oregon’ has the max IBU value of 138.

ABV statistical attributes

Stats <- Totaldata_imp %>% summarize(Mean = mean(ABV, na.rm=TRUE),
                        Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
                        SD = sd(ABV,na.rm=T), N = n())
#Histogram and Density Plot
His_Den <- Totaldata_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
  geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('ABV Statistical Attributes - Histogram, Density and Box Plots') + 
  scale_x_continuous(breaks = seq(0,0.14,0.01),labels = seq(0,0.14,0.01)) + labs(y="Density / Count")

#Box plot for ABV data
Box <- Totaldata_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) + 
  geom_boxplot(col='black',fill='#FF6666') + scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + scale_y_continuous(expand = c(0,0), limit = c(-0.4,0.4))

#Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_Den),
                ggplotGrob(Box),
                size = "first"))

Stats
##         Mean Median   Max   Min         SD    N
## 1 0.05976762  0.057 0.128 0.001 0.01337577 2410

All statistically significant attributes for the ABV values are shown above The distribution of ABV is slightly right skewed. ABV values from 4.90% to 5.95% are the most widely used. 6.5% ABV values show second highest peak from the histogram. . There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%. The mean ABV is 5.98%, median 5.7% and standard deviation of ABV is 1.34%.

ABV v. IBU scatter plot

#Scatter plot with 
Totaldata_imp %>% filter(!is.na(ABV) & !is.na(IBU)) %>% 
  ggplot(aes(y=ABV, x=IBU)) + geom_point(aes(colour = ABV/IBU)) + geom_smooth(method=loess) + ggtitle("ABV V. IBU Plot") + scale_y_continuous(labels = percent)
## `geom_smooth()` using formula 'y ~ x'

The scatter plot indicates there is a moderately positive linear relationship (i.e., as IBU increases ABV increases). Most beers with lower IBU (less than 50) have ABV values around 5%. When IBU value increases, ABV values spreads out. But most beers with IBU values above 50, their ABV values spread out within the region between 5% and 10%.

ABV_IBU statistical significance with respect to IPAs and Ale’s

# ABV/ IBU for IPA
Data_IPA <- Totaldata_imp %>% 
  filter(str_detect(Style, regex(str_c('\\b','IPA','\\b',sep=''), ignore_case = T)))
#Stats IPA
Stats_IPA_ABV <- Data_IPA %>% summarize(Mean = mean(ABV, na.rm=TRUE),
                        Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
                        SD = sd(ABV,na.rm=T), N = n())
Stats_IPA_IBU <- Data_IPA %>% summarize(Mean = mean(IBU, na.rm=TRUE),
                        Median = median(IBU,na.rm=T), Max = max(IBU,na.rm=T), Min = min(IBU,na.rm=T),
                        SD = sd(IBU,na.rm=T), N = n())
#Histogram and Density Plot for IPA with ABV
His_Den_IPA_ABV <- Data_IPA %>% ggplot(aes(x=ABV)) +
  geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('IPA - ABV Statistical Attributes - Histogram, Density') + 
  scale_x_continuous(breaks = seq(0,0.14,0.01),labels = seq(0,0.14,0.01)) + labs(y="Density / Count")
#Histogram and Density Plot for IPA with IBU
His_Den_IPA_IBU <- Data_IPA %>% ggplot(aes(x=IBU)) +
  geom_histogram(aes(y=..density..),colour='red',fill='pink') +
  geom_density(alpha=.2, fill='#FFFF00') + 
  ggtitle('IPA - IBU Statistical Attributes - Histogram, Density') + 
  scale_x_continuous() + labs(y="Density / Count")


#ABV / IBU for Ale
Data_Ale <- Totaldata_imp %>% 
  filter(str_detect(Style, regex(str_c('\\b','Ale','\\b',sep=''), ignore_case = T)))

Stats_Ale_ABV <- Data_Ale %>% summarize(Mean = mean(ABV, na.rm=TRUE),
                        Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
                        SD = sd(ABV,na.rm=T), N = n())
Stats_Ale_IBU <- Data_Ale %>% summarize(Mean = mean(IBU, na.rm=TRUE),
                        Median = median(IBU,na.rm=T), Max = max(IBU,na.rm=T), Min = min(IBU,na.rm=T),
                        SD = sd(IBU,na.rm=T), N = n())
#Histogram and Density Plot for Ale with ABV
His_Den_Ale_ABV <- Data_Ale %>% ggplot(aes(x=ABV)) +
  geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('Ale - ABV Statistical Attributes - Histogram, Density') + 
  scale_x_continuous(breaks = seq(0,0.19,0.01),labels = seq(0,0.19,0.01)) + labs(y="Density / Count")
#Histogram and Density Plot for Ale with IBU
His_Den_Ale_IBU <- Data_Ale %>% ggplot(aes(x=IBU)) +
  geom_histogram(aes(y=..density..),colour='red',fill='pink') +
  geom_density(alpha=.2, fill='#FFFF00') + 
  ggtitle('Ale - IBU Statistical Attributes - Histogram, Density') + 
  scale_x_continuous() + labs(y="Density / Count")

#Fitting all the statistical data in one screen
require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
a <- grid.arrange(His_Den_IPA_ABV,His_Den_IPA_IBU,His_Den_Ale_ABV,His_Den_Ale_IBU,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Stats_IPA_ABV
##         Mean Median   Max   Min         SD   N
## 1 0.06860334  0.068 0.099 0.027 0.01234518 572
Stats_IPA_IBU
##       Mean Median Max       Min       SD   N
## 1 65.78195     65 138 0.4818612 20.66601 572
Stats_Ale_ABV
##        Mean Median   Max   Min        SD   N
## 1 0.0569628  0.055 0.099 0.035 0.0110365 978
Stats_Ale_IBU
##       Mean Median Max Min       SD   N
## 1 36.67542     35 115   4 16.64074 978

The above plots and values show IPA’s have higher ABV and IBU values than the Ale’s. This indicates that IPA’s are more alcoholic and more bitter than the Ale’s in general.

#ABV_IBU with respect to IPAs and Ale’s using kNN

#Checking to see number of rows in each IPA and Ale data
nrow(Data_IPA)
## [1] 572
nrow(Data_Ale)
## [1] 978
Data_IPA$Style1 <- "IPA" #Adding Style1 column for IPA style beer as IPA
Data_Ale$Style1 <- "Ale" #Adding Style1 column for Ale style beer as Ale
Data_IPA$Style1 <- as.factor(Data_IPA$Style1)
Data_Ale$Style1 <- as.factor(Data_Ale$Style1)

#Combining the IPA and Ale data to create a training set
Data_Ale_IPA <- rbind(Data_IPA, Data_Ale)

#Plot of all IPA and Ale data
Data_Ale_IPA %>% ggplot(aes(x=IBU, y=ABV)) + geom_point(aes(colour=Style1)) +
  theme(axis.title.x = element_text(size=12, face="bold", colour = "black"),    
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
) + ggtitle("ABV v. IBU based on Style") + labs(colour = "Style")

# kNN approach for 500 dataset iterations and 1-30 k values
iterations = 500 # number of iterations to test the k value
numks = 30      # number of k used in the iterations
splitPerc = .7 # split percentage assumed from the total dataset
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
beer_train = Data_Ale_IPA[trainIndices,]
beer_test = Data_Ale_IPA[-trainIndices,]
#columns 4 and 5 represent the ABV and IBU values 
classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style1, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications,beer_test$Style1))
classifications
##   [1] IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA Ale
##  [19] IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA Ale IPA Ale IPA Ale IPA IPA IPA
##  [37] IPA IPA Ale IPA IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA
##  [55] IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA Ale IPA IPA Ale IPA Ale IPA
##  [73] IPA IPA Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA
##  [91] Ale Ale IPA Ale Ale IPA Ale Ale Ale IPA IPA IPA Ale Ale Ale IPA Ale Ale
## [109] IPA IPA Ale IPA Ale Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale Ale
## [127] Ale IPA IPA IPA Ale IPA IPA Ale IPA IPA IPA Ale Ale IPA Ale IPA Ale Ale
## [145] Ale Ale Ale IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale
## [163] Ale Ale Ale IPA IPA Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale IPA Ale Ale
## [199] Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [217] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA
## [235] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale IPA
## [289] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [307] Ale IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale
## [325] Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [343] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [361] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [379] Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale IPA IPA IPA
## [397] Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [415] Ale Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale
## [433] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale
## [451] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale
## attr(,"prob")
##   [1] 0.6000000 1.0000000 0.6000000 0.6000000 0.8000000 0.6000000 0.8750000
##   [8] 0.8888889 1.0000000 0.6000000 0.8235294 0.6000000 1.0000000 1.0000000
##  [15] 1.0000000 0.5000000 0.6000000 0.6000000 1.0000000 1.0000000 0.8333333
##  [22] 0.8000000 0.8888889 1.0000000 1.0000000 0.8000000 0.6000000 1.0000000
##  [29] 0.6000000 0.6000000 0.6000000 0.6000000 1.0000000 0.6666667 0.8000000
##  [36] 1.0000000 1.0000000 1.0000000 0.8333333 0.8000000 0.6000000 0.8000000
##  [43] 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000 1.0000000
##  [50] 1.0000000 1.0000000 1.0000000 1.0000000 0.8571429 0.8333333 1.0000000
##  [57] 0.8000000 0.8000000 0.8000000 1.0000000 0.8000000 1.0000000 0.7500000
##  [64] 0.8000000 0.8000000 0.8000000 0.8000000 1.0000000 0.8000000 0.7777778
##  [71] 0.6000000 0.6000000 0.6000000 1.0000000 1.0000000 1.0000000 0.6000000
##  [78] 0.6000000 1.0000000 1.0000000 0.6000000 0.6000000 0.8000000 1.0000000
##  [85] 0.6000000 0.6000000 0.8000000 0.8571429 0.8571429 1.0000000 0.8000000
##  [92] 1.0000000 1.0000000 0.6666667 0.7500000 1.0000000 0.6666667 0.6666667
##  [99] 0.6666667 0.6000000 0.8571429 0.8333333 0.8235294 0.6000000 1.0000000
## [106] 0.8571429 0.8000000 0.8000000 0.8000000 0.8000000 0.6000000 0.8000000
## [113] 0.6000000 0.8333333 0.8235294 0.6666667 1.0000000 1.0000000 1.0000000
## [120] 1.0000000 0.9090909 0.8888889 1.0000000 1.0000000 1.0000000 0.7000000
## [127] 0.6000000 1.0000000 0.8888889 0.6000000 0.6000000 0.8000000 0.9090909
## [134] 0.8000000 1.0000000 1.0000000 0.8000000 0.8000000 0.6000000 0.8000000
## [141] 0.6000000 0.7142857 0.6000000 0.6000000 0.6000000 0.6000000 0.6000000
## [148] 1.0000000 1.0000000 0.6000000 0.8235294 0.8000000 0.7142857 0.8333333
## [155] 1.0000000 0.6000000 1.0000000 0.8000000 0.6000000 0.8000000 0.6000000
## [162] 1.0000000 1.0000000 1.0000000 0.6000000 0.7777778 0.6666667 0.8750000
## [169] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000 1.0000000
## [176] 0.6666667 0.8000000 0.8000000 0.6000000 0.6666667 1.0000000 0.8000000
## [183] 0.6666667 1.0000000 1.0000000 0.6000000 0.8000000 0.6000000 1.0000000
## [190] 1.0000000 1.0000000 0.8000000 0.6000000 1.0000000 1.0000000 1.0000000
## [197] 0.6000000 0.6000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000
## [204] 0.8571429 1.0000000 0.6000000 1.0000000 1.0000000 0.8333333 0.6000000
## [211] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000
## [218] 1.0000000 0.6666667 0.7000000 1.0000000 1.0000000 1.0000000 1.0000000
## [225] 0.6250000 0.8333333 1.0000000 0.6666667 1.0000000 0.6000000 0.6000000
## [232] 1.0000000 0.6000000 0.6666667 0.6000000 1.0000000 0.6666667 1.0000000
## [239] 1.0000000 1.0000000 0.7500000 0.6250000 0.6250000 1.0000000 0.8000000
## [246] 1.0000000 0.6666667 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [253] 0.8333333 0.6666667 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [267] 0.8000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8000000 0.8000000
## [274] 0.8000000 0.6000000 1.0000000 0.8333333 1.0000000 0.8571429 0.6666667
## [281] 1.0000000 1.0000000 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [288] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [295] 1.0000000 1.0000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000
## [302] 0.8333333 1.0000000 1.0000000 1.0000000 0.8000000 0.6666667 0.8000000
## [309] 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000
## [316] 0.5714286 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.8000000
## [323] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [330] 1.0000000 0.6000000 0.8000000 1.0000000 0.6666667 0.6000000 1.0000000
## [337] 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333
## [344] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000
## [351] 0.8571429 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.6666667
## [358] 1.0000000 0.8000000 1.0000000 0.8000000 1.0000000 1.0000000 0.8333333
## [365] 1.0000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8333333 1.0000000
## [372] 0.8333333 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [379] 0.8000000 0.6000000 0.8000000 0.8000000 0.5555556 0.8000000 1.0000000
## [386] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 0.6000000 1.0000000
## [393] 1.0000000 0.8000000 0.6000000 0.8000000 0.8000000 1.0000000 0.5555556
## [400] 1.0000000 0.8000000 0.6000000 1.0000000 1.0000000 0.8000000 0.8000000
## [407] 0.8000000 1.0000000 0.6666667 1.0000000 0.8000000 0.8000000 0.8000000
## [414] 0.8000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [421] 0.8000000 0.6000000 1.0000000 0.6666667 0.6000000 0.6000000 1.0000000
## [428] 1.0000000 0.6000000 1.0000000 0.6666667 1.0000000 1.0000000 1.0000000
## [435] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [442] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8235294
## [449] 0.8750000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [456] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 0.8000000 0.7142857
## [463] 1.0000000 1.0000000 0.8000000
## Levels: IPA Ale
CM
## Confusion Matrix and Statistics
## 
##                
## classifications IPA Ale
##             IPA 104  33
##             Ale  52 276
##                                          
##                Accuracy : 0.8172         
##                  95% CI : (0.779, 0.8513)
##     No Information Rate : 0.6645         
##     P-Value [Acc > NIR] : 1.624e-13      
##                                          
##                   Kappa : 0.5773         
##                                          
##  Mcnemar's Test P-Value : 0.05089        
##                                          
##             Sensitivity : 0.6667         
##             Specificity : 0.8932         
##          Pos Pred Value : 0.7591         
##          Neg Pred Value : 0.8415         
##              Prevalence : 0.3355         
##          Detection Rate : 0.2237         
##    Detection Prevalence : 0.2946         
##       Balanced Accuracy : 0.7799         
##                                          
##        'Positive' Class : IPA            
## 
#Above Confusion Matrix and Statistics show a accuracy of ~80%
for(j in 1:iterations)
{
  accs = data.frame(accuracy = numeric(30), k = numeric(30))
  trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
  beer_train = Data_Ale_IPA[trainIndices,]
  beer_test = Data_Ale_IPA[-trainIndices,]
  for(i in 1:numks)
  {
    classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style1, prob = TRUE, k = i)
    table(classifications,beer_test$Style1)
    CM = confusionMatrix(table(classifications,beer_test$Style1))
    masterAcc[j,i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
p = ggplot(mapping = aes(x = seq(1,numks,1), y = MeanAcc)) + geom_line() + ggtitle("Mean Accuracy v. Number of k") + xlab('k values')
ggplotly(p)

Considering the IPA and Ale data using 70%/30% split and shuffling test and training data 500 times, with K assigned from 1 to 30 during each shuffling, we can achieve the highest mean accuracy 80.0% when K=5.

ABV_IBU with respect to IPAs and Ale’s using Naive Bayes

iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7 
for(j in 1:iterations)
{
  
  trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
  beer_train = Data_Ale_IPA[trainIndices,]
  beer_test = Data_Ale_IPA[-trainIndices,]
  model = naiveBayes(beer_train[,c(4,5)],as.factor(beer_train$Style1),laplace = 1)
  table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style1))
  CM = confusionMatrix(table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style1)))
  masterAcc[j] = CM$overall[1]
  masterSen[j] = CM$byClass[1]
  masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.7944387
MeanSen
## [1] 0.6708932
MeanSpec
## [1] 0.8668096

Naive Bayes model run for 500 iterations and 70/30% split gave the Mean accuracy (~79%), Mean sensitivity (~67%) and Mean specificity (~86%) as shown from the values above. The data is close match with kNN model.

Please go through the presentation final conclusion for the entire data.